{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 06 Two-way (column or row) relative frequency tables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%html\n", "" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import plotly.graph_objects as go\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import findspark\n", "\n", "findspark.init()\n", "from pyspark.context import SparkContext\n", "from pyspark.sql import functions as F\n", "from pyspark.sql.session import SparkSession\n", "\n", "spark = SparkSession.builder.appName(\"statistics\").master(\"local\").getOrCreate()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[khanacademy](https://www.khanacademy.org/math/ap-statistics/analyzing-categorical-ap/stats-two-way-tables/v/two-way-relative-frequency-tables?modal=1)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![Two-way (column or row) relative frequency tables fig 1](./imgs/01-05-01.png)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "dataset = {\n", " \"Car\": 28 * [\"SUV\"] + 35 * [\"Sport car\"] + 97 * [\"SUV\"] + 104 * [\"Sport car\"],\n", " \"Accident\": 28 * [\"yes\"] + 35 * [\"yes\"] + 97 * [\"no\"] + 104 * [\"no\"],\n", "}" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CarAccident
0SUVyes
1SUVyes
2SUVyes
3SUVyes
4SUVyes
.........
259Sport carno
260Sport carno
261Sport carno
262Sport carno
263Sport carno
\n", "

264 rows × 2 columns

\n", "
" ], "text/plain": [ " Car Accident\n", "0 SUV yes\n", "1 SUV yes\n", "2 SUV yes\n", "3 SUV yes\n", "4 SUV yes\n", ".. ... ...\n", "259 Sport car no\n", "260 Sport car no\n", "261 Sport car no\n", "262 Sport car no\n", "263 Sport car no\n", "\n", "[264 rows x 2 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(dataset)\n", "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+--------+\n", "|Car|Accident|\n", "+---+--------+\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "|SUV| yes|\n", "+---+--------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))\n", "sdf.registerTempTable(\"sdf_table\")\n", "sdf.show()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Accident Car \n", "no SUV 97\n", " Sport car 104\n", "yes SUV 28\n", " Sport car 35\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"Accident\", \"Car\"]).size()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---------+-----+\n", "|Accident| Car|count|\n", "+--------+---------+-----+\n", "| no| SUV| 97|\n", "| yes|Sport car| 35|\n", "| no|Sport car| 104|\n", "| yes| SUV| 28|\n", "+--------+---------+-----+\n", "\n" ] } ], "source": [ "sdf.groupby(\"Accident\", \"Car\").count().show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---------+-----+\n", "|Accident| Car|count|\n", "+--------+---------+-----+\n", "| no| SUV| 97|\n", "| yes|Sport car| 35|\n", "| no|Sport car| 104|\n", "| yes| SUV| 28|\n", "+--------+---------+-----+\n", "\n" ] } ], "source": [ "spark.sql(\n", " \"select Accident, Car, count(*) as count from sdf_table group by Accident, Car\"\n", ").show()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CarSUVSport car
Accident
no97104
yes2835
\n", "
" ], "text/plain": [ "Car SUV Sport car\n", "Accident \n", "no 97 104\n", "yes 28 35" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "two_way_table = pd.crosstab(df[\"Accident\"], df[\"Car\"])\n", "two_way_table" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------------+---+---------+\n", "|Accident_Car|SUV|Sport car|\n", "+------------+---+---------+\n", "| yes| 28| 35|\n", "| no| 97| 104|\n", "+------------+---+---------+\n", "\n" ] } ], "source": [ "s_two_way_table = sdf.crosstab(\"Accident\", \"Car\")\n", "s_two_way_table.show()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CarSUVSport car
Accident
no0.7760.748201
yes0.2240.251799
\n", "
" ], "text/plain": [ "Car SUV Sport car\n", "Accident \n", "no 0.776 0.748201\n", "yes 0.224 0.251799" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "freq_table = two_way_table.copy()\n", "freq_table[\"SUV\"] = two_way_table[\"SUV\"] / two_way_table[\"SUV\"].sum()\n", "freq_table[\"Sport car\"] = two_way_table[\"Sport car\"] / two_way_table[\"Sport car\"].sum()\n", "freq_table" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------------+-----+------------------+\n", "|Accident_Car| SUV| Sport car|\n", "+------------+-----+------------------+\n", "| yes|0.224|0.2517985611510791|\n", "| no|0.776|0.7482014388489209|\n", "+------------+-----+------------------+\n", "\n" ] } ], "source": [ "s_freq_table = s_two_way_table\n", "s_freq_table = s_freq_table.withColumn(\n", " \"SUV\", F.col(\"SUV\") / s_freq_table.select(F.sum(\"SUV\")).collect()[0][0]\n", ")\n", "s_freq_table = s_freq_table.withColumn(\n", " \"Sport car\",\n", " F.col(\"Sport car\") / s_freq_table.select(F.sum(\"Sport car\")).collect()[0][0],\n", ")\n", "s_freq_table.show()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CarSUVSport car
Accident
no0.7760.748201
yes0.2240.251799
Total1.0001.000000
\n", "
" ], "text/plain": [ "Car SUV Sport car\n", "Accident \n", "no 0.776 0.748201\n", "yes 0.224 0.251799\n", "Total 1.000 1.000000" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "freq_table = freq_table.append(freq_table.sum().rename(\"Total\"))\n", "freq_table" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 4 }